library(MASS)
library(haven)
library(Matrix)
library(lfe)
library(ggplot2)
library(RColorBrewer)
library(Hotelling)
library(did)

LutzData <- read_dta("ros_ccd_aejp.dta")

### I followed Lutz (2011)
DataCleaning <- list(start_year=88,
                     end_year=107,
                     district_size=10000,
                     T0=11
)

### make Boston, Detroit and Fort Worth into non-dismissal districts
LutzData$cdum[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                LutzData$leaid == 4819700] <- 0
LutzData$cdum_ever[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                     LutzData$leaid == 4819700] <- 0
LutzData$cdum_time[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                     LutzData$leaid == 4819700] <- 0
LutzData$underc[LutzData$leaid==2612000 | LutzData$leaid == 2502790 | 
                  LutzData$leaid == 4819700] <- 0

### multiply dissimilarity index by 100
LutzData$disd <- LutzData$disd*100

### following Lutz
# drop never-court-mandated school districts
# underc: under court order as of 1991
# ros: from the Rossell and Armor sample
# a_raceflag: insufficient race data
# totbase: n of students in 1988
### drop Holmes county consolidated school district
### for missing dissimilarity index: 2801980
LutzData <- LutzData[LutzData$ros==1 & LutzData$underc==1 & 
                       is.na(LutzData$underc)==FALSE &
                       LutzData$a_raceflag!=1 & 
                       LutzData$totbase>DataCleaning$district_size &
                       LutzData$leaid!=2801980,]

# a: first year of panel
# b: last year of panel
LutzData <- LutzData[LutzData$year >= DataCleaning$start_year & 
                       LutzData$year <= DataCleaning$end_year,]

### set arbitrarily large treatment timing for never-treated units
LutzData$cdum_time[LutzData$cdum_ever==0] <- 1000

### keep units that are observed for every period
LutzData <- 
  LutzData[sapply(LutzData$leaid,  
                  function(x) 
                    nrow(LutzData[LutzData$leaid==x,])==
                    (DataCleaning$end_year-DataCleaning$start_year+1)),]

### rename index variables and recenter them 
### drop earlier-treated units
colnames(LutzData)[c(1,2,11,48)] <- c("time", "id","E","weight")
LutzData$time <- LutzData$time - DataCleaning$start_year 
LutzData$E <- LutzData$E - DataCleaning$start_year 
LutzData <- LutzData[LutzData$E>DataCleaning$T0,] 
temp <- unique(LutzData$id)
LutzData$id <- sapply(LutzData$id,function(x) which(x==temp))
remove(temp)

T <- DataCleaning$end_year - DataCleaning$start_year
N <- length(unique(LutzData$id))
LutzData$E[LutzData$E>T] <- T+1

colnames(LutzData)[c(41,50)] <- c("n_student", "p_freelunch")

### construct a first-differenced dataset for Kmeans ###
### the order is Y, id, time, treatment timing, weight, X
KmeansData <- LutzData[,c(3,2,1,11,48,
                          49,54,56,41,50)]
KmeansData$weight <- 1

### take first differences
KmeansData$diff_disd <- 0
for (i in 1:N){
  for (t in 1:T){
    KmeansData$diff_disd[(KmeansData$id==i)*(KmeansData$time==t)==1] <- 
      KmeansData$disd[(KmeansData$id==i)*(KmeansData$time==t)==1] - 
      KmeansData$disd[(KmeansData$id==i)*(KmeansData$time==(t-1))==1]
  }
}
remove(i,t)

save.image("data_cleaned.RData")
